library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(knitr)
library(gtsummary)
library(tidyr)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(mapview)


# Load the transformed UFO sightings data set from the working directory.
ufo_sightings_transformed <- readr::read_csv(
  file = "ufo-sightings-transformed.csv"
)
## New names:
## Rows: 80328 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (9): date_documented, Season, Country_Code, Country, Region, Locale, UF... dbl
## (7): ...1, Year, Month, Hour, latitude, longitude, length_of_encounter_... dttm
## (1): Date_time
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Coerce Into Factors

 # Convert every categorical column to a factor in a single pass.
 # Each column is listed exactly once (Country was previously converted
 # twice, which was redundant). Year and Hour are deliberately treated as
 # categorical here as well.
 ufo_sightings_transformed <- ufo_sightings_transformed %>%
   mutate(
     across(
       c(Season, Country_Code, Country, UFO_shape, Year, Hour, Region),
       factor
     )
   )

Summary Stats

library(dplyr)
library(gtsummary)


# Build a descriptive table for encounter duration, season, and month.
# Continuous variables are reported as mean (SD); categorical variables as
# count (percentage).
UFO_summary_table <- gtsummary::tbl_summary(
  data = select(
    ufo_sightings_transformed,
    length_of_encounter_seconds, Season, Month
  ),
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  )
)

# Display the table
UFO_summary_table
Characteristic N = 80,3281
length_of_encounter_seconds 9,017 (620,232)
Season
    Autumn 21,735 (27%)
    Spring 16,268 (20%)
    Summer 26,307 (33%)
    Winter 16,018 (20%)
Month 6.8 (3.2)
1 Mean (SD); n (%)

Including Plots

You can also embed plots, for example:

# ggplot2 is already attached at the top of this file, so no
# install-on-demand guard is needed here; this also keeps the report free of
# package-installation side effects.
library(ggplot2)

# Bar chart of the number of sightings per season, drawn with narrow bars
ggplot(ufo_sightings_transformed, aes(x = Season)) +
  geom_bar(fill = "skyblue", color = "darkblue", width = 0.5) +
  labs(
    title = "Bar Plot For UFO Sightings appeared in Different Seasons",
    x = "Season",
    y = "Count"
  ) +
  theme_minimal()

# Keep only complete cases: any row with a missing value in any column is
# dropped, so everything after this point works on the reduced data set.
ufo_sightings_transformed <- ufo_sightings_transformed %>%
  na.omit()

1. Write the hypotheses

\(H_0:\) The average length of a UFO encounter in the California region is equal to the average length of a UFO encounter in the England region.

\(\mu_{cal}=\mu_{eng}\)

\(H_A:\) The average length of a UFO encounter in the California region is greater than the average length of a UFO encounter in the England region.

\(\mu_{cal} > \mu_{eng}\)

2. Check Conditions

# Restrict to the two regions being compared and to encounter lengths between
# 0.1 and 15,000 seconds. `%in%` never matches NA and `between()` yields NA
# for missing values, so rows with a missing Region or a missing or infinite
# duration are already dropped without separate NA / finiteness checks.
california_england_data <- ufo_sightings_transformed %>%
  filter(
    Region %in% c("California", "England"),
    between(length_of_encounter_seconds, 0.1, 15000)
  )

# Draw 500 sightings from each region (1,000 rows in total).
# slice_sample() replaces the superseded sample_n(); if a region ever has
# fewer than 500 rows it returns all of them instead of raising an error.
# NOTE(review): no seed is set in this chunk, so the sample, and therefore
# the histogram drawn from it, changes from run to run.
sampled_data <- california_england_data %>%
  group_by(Region) %>%
  slice_sample(n = 500) %>%
  ungroup()

# Histogram of encounter length for each region, one panel per region.
# The x-axis is limited to 0-5000 seconds; observations outside that range
# are removed by the scale, which is what triggers the "Removed ... rows"
# warnings.
ggplot(sampled_data, aes(x = length_of_encounter_seconds, fill = Region)) +
  geom_histogram(binwidth = 200, colour = "white", show.legend = FALSE) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, by = 500)) +
  facet_wrap(~ Region) +
  labs(title = "Length of Encounter Seconds vs. Region")
## Warning: Removed 24 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

3. Test Statistic

# Encounter lengths for each region, extracted as plain vectors for the
# two-sample test.
California_ufo_sightings <- ufo_sightings_transformed %>%
  filter(!is.na(length_of_encounter_seconds), Region == "California") %>%
  pull(length_of_encounter_seconds)

england_ufo_sightings <- ufo_sightings_transformed %>%
  filter(!is.na(length_of_encounter_seconds), Region == "England") %>%
  pull(length_of_encounter_seconds)

# Sample sizes (1 = California, 2 = England)
n1 <- length(California_ufo_sightings)
n2 <- length(england_ufo_sightings)

\(n_1\) = 9374

\(n_2\) = 1885

# Sample means and standard deviations (1 = California, 2 = England)
xbar1 <- mean(California_ufo_sightings)
s1 <- sd(California_ufo_sightings)

xbar2 <- mean(england_ufo_sightings)
s2 <- sd(england_ufo_sightings)

# Hypothesised difference under H0 and the observed difference in means
null_value <- 0
stat <- xbar1 - xbar2

# Standard error of the difference, without pooling the two variances
se <- sqrt(s1^2 / n1 + s2^2 / n2)

# Degrees of freedom: the smaller sample size minus one
df <- min(n1, n2) - 1

# Test statistic
t_stat <- (stat - null_value) / se

\(\bar{x}_{cal} = 3478.5996\)

\(\bar{x}_{eng} = 6.6377716 \times 10^{4}\)

\(t_{stat}\) = -1.1732

4. P-Value

# Upper-tail probability, matching the one-sided "greater than" alternative
p_val <- pt(q = t_stat, df = df, lower.tail = FALSE)

\(p\)-value = 0.8796

Decision: Fail to reject \(H_0\)

Conclusion: We do not have enough evidence that the average length of a UFO encounter in California is greater than the average length of a UFO encounter in England.

# Cross-check the manual calculation with R's built-in two-sample t-test,
# using the same one-sided ("greater") alternative.
t.test(
  x = California_ufo_sightings,
  y = england_ufo_sightings,
  alternative = "greater",
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  California_ufo_sightings and england_ufo_sightings
## t = -1.1732, df = 1886.3, p-value = 0.8796
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -151126.7       Inf
## sample estimates:
## mean of x mean of y 
##   3478.60  66377.72

MapView For California

MapView England Region

WORLD-MAP